## Load the libraries
library(tidytuesdayR)
library(ggplot2)
library(plotly)
library(dplyr)
library(tidyr)
library(corrplot)
library(corrgram)
library(usmap)
library(moments)
library(animation)
## Pull the datasets
tuition_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/tuition_cost.csv')

tuition_income <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/tuition_income.csv') 

salary_potential <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/salary_potential.csv')

historical_tuition <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/historical_tuition.csv')

diversity_school <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/diversity_school.csv')
#Modify year 
historical_tuition$yearmod <- sapply(strsplit(historical_tuition$year, split='-', fixed=TRUE), function(x) (x[1]))

#Create a subset for All Institutions
historical_tuitionALL <- subset(historical_tuition, historical_tuition$type == "All Institutions")

#Create subsets for all, 2, and 4 year institutions
histall <- subset(historical_tuitionALL, historical_tuitionALL$tuition_type == "All Constant")
hist2year <- subset(historical_tuitionALL, historical_tuitionALL$tuition_type == "2 Year Constant")
hist4year <- subset(historical_tuitionALL, historical_tuitionALL$tuition_type == "4 Year Constant")
## Plot tutition cost over time for all institutions
p <- ggplot(histall, aes(yearmod, tuition_cost)) +
  labs(x = "Year", y="Tuition Cost", title = "All Institution Cost over Time") +
  theme(axis.text.x = element_text(angle = 90))+
  geom_col()
ggplotly(p)
## Plot tutition cost over time for 2 year institutions
p <- ggplot(hist2year, aes(yearmod, tuition_cost)) +
  labs(x = "Year", y="Tuition Cost", title = "2 Year Institution Cost over Time") +
  theme(axis.text.x = element_text(angle = 90))+
  geom_col()
ggplotly(p)
## Plot tutition cost over time for 4 year institutions
p <- ggplot(hist4year, aes(yearmod, tuition_cost)) +
  labs(x = "Year", y="Tuition Cost", title = "4 Year Institution Cost over Time") +
  theme(axis.text.x = element_text(angle = 90))+
  geom_col()
ggplotly(p)
  #create line graph to compare the historical trend of tuition costs across different universities 
hist_tuition <- historical_tuition %>%
  filter(tuition_type == "All Constant" | tuition_type == "4 Year Constant" | tuition_type == "2 Year Constant") %>%
  filter(type == "All Institutions")
ts_plot <- ggplot(data = hist_tuition, aes(x = year, group = tuition_type)) + 
  geom_line(aes(y = tuition_cost, color = tuition_type), size = 1) + 
  ggtitle("Historical Costs of College \n Adjusted for Inflation") + 
  theme(axis.text.x = element_text(angle = 90)) + 
  xlab("Year") + 
  ylab("College Tuition Cost") + 
  labs(color = "Type of Degree")
  
ggplotly(ts_plot)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Tuition costs have continued to increase over the years, especially in for 4 year institutions, even when controlling for inflation. From 1985 to 2016 the cost of 2 year institutions has gone up $3,090 whereas the cost of 4 year institutions has risen $14,319. There’s a significant difference in the increase of price between the two.

## replace _ with ,
tuition_income$income_lvl <- replace(tuition_income$income_lvl,tuition_income$income_lvl=="48_001 to 75,000","48,001 to 75,000")
lowestincome <- subset(tuition_income, tuition_income$income_lvl == "0 to 30,000")
lowincome <- subset(tuition_income, tuition_income$income_lvl == "30,001 to 48,000")
midincome <- subset(tuition_income, tuition_income$income_lvl == "48,001 to 75,000")
highincome <- subset(tuition_income, tuition_income$income_lvl == "75,001 to 110,000")
highestincome <- subset(tuition_income, tuition_income$income_lvl == "Over 110,000")
  #boxplot of income levels and net costs 
p <- ggplot(tuition_income, aes(x = income_lvl, y = net_cost, fill = income_lvl)) +
  geom_boxplot()+
  labs(x = "Income Levels", y="Net Cost", title = "Income Levels and Net Costs Compared", fill = "Income Level")+
  theme(axis.text.x = element_text(angle = 90)) 
ggplotly(p)
## Merge salary potential and tuition cost datasets by name of college
potential_salary <- merge(salary_potential,tuition_cost,by=c("name"))
  #ggplot for tuition vs career pay 
p <- ggplot(potential_salary, aes(x = in_state_tuition, y = mid_career_pay, fill = type,label = name)) +
  geom_point()+
  labs(x = "In State Tuition Cost", y="Estimated Mid Career Pay", title = "In State vs. Estimated Mid Career Pay for Types of Instituitions Costs Compared", fill = "Type of College") 
ggplotly(p)
## Group in state tution into differnt levels
potential_salary <- potential_salary %>% 
  mutate(salary_group = case_when(
    between(in_state_tuition, 0, 10000) ~ "0 to 10000",
    between(in_state_tuition, 10000, 20000) ~ "10000 to 20000",
    between(in_state_tuition, 20000, 30000) ~ "20000 to 30000",
    between(in_state_tuition, 30000, 40000) ~ "30000 to 40000",
    between(in_state_tuition, 40000, 50000) ~ "40000 to 50000",
    between(in_state_tuition, 50000, 60000) ~ "50000 to 60000",
    between(in_state_tuition, 60000, 70000) ~ "60000 to 70000",
    TRUE ~ NA_character_
  ))
p <- ggplot(potential_salary, aes(x = salary_group, y = mid_career_pay, fill = type)) +
  geom_boxplot()+
  labs(x = "In State Tuition", y="Estimated Mid Career Pay", title = "In State Tuition and Mid Career Pay Compared", fill = "Type of College")+
  theme(axis.text.x = element_text(angle = 45))
ggplotly(p)
cor(potential_salary$in_state_tuition,potential_salary$mid_career_pay)
## [1] 0.5134
cor(potential_salary$in_state_tuition,potential_salary$early_career_pay)
## [1] 0.4781204
## Group in STEM percent into differnt levels
potential_salary <- potential_salary %>% 
  mutate(stem_group = case_when(
    between(stem_percent, 0, 10) ~ "0 to 10",
    between(stem_percent, 10, 20) ~ "10 to 20",
    between(stem_percent, 20, 30) ~ "20 to 30",
    between(stem_percent, 30, 40) ~ "30 to 40",
    between(stem_percent, 40, 50) ~ "40 to 50",
    between(stem_percent, 50, 60) ~ "50 to 60",
    between(stem_percent, 60, 70) ~ "60 to 70",
    between(stem_percent, 70, 80) ~ "70 to 80",
    between(stem_percent, 80, 90) ~ "80 to 90",
    between(stem_percent, 90, 100) ~ "90 to 100",
    TRUE ~ NA_character_
  ))
p <- ggplot(potential_salary, aes(x = stem_group, y = mid_career_pay, fill = type)) +
  geom_boxplot()+
  labs(x = "STEM Percent", y="Estimated Mid Career Pay", title = "STEM Percent and Mid Career Pay Compared", fill = "Type of School")+
  theme(axis.text.x = element_text(angle = 45))
ggplotly(p)
## Group in STEM percent into differnt levels
potential_salary <- potential_salary %>% 
  mutate(makebetter_group = case_when(
    between(make_world_better_percent, 0, 10) ~ "0 to 10",
    between(make_world_better_percent, 10, 20) ~ "10 to 20",
    between(make_world_better_percent, 20, 30) ~ "20 to 30",
    between(make_world_better_percent, 30, 40) ~ "30 to 40",
    between(make_world_better_percent, 40, 50) ~ "40 to 50",
    between(make_world_better_percent, 50, 60) ~ "50 to 60",
    between(make_world_better_percent, 60, 70) ~ "60 to 70",
    between(make_world_better_percent, 70, 80) ~ "70 to 80",
    between(make_world_better_percent, 80, 90) ~ "80 to 90",
    between(make_world_better_percent, 90, 100) ~ "90 to 100",
    TRUE ~ "Not Reported"
  ))
p <- ggplot(potential_salary, aes(x = makebetter_group, y = mid_career_pay, fill = type)) +
  geom_boxplot()+
  labs(x = "Feel Make World Better Percent", y="Estimated Mid Career Pay", title = "Make World Better Percent and Mid Career Pay Compared")+
  theme(axis.text.x = element_text(angle = 45))
ggplotly(p)
  #read in more college data 
library(readr)
College_Data <- read_csv("College_Data.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   UNITID = col_double(),
##   OPEID = col_double(),
##   OPEID6 = col_double(),
##   HCM2 = col_double(),
##   MAIN = col_double(),
##   PREDDEG = col_double(),
##   HIGHDEG = col_double(),
##   CONTROL = col_double(),
##   ST_FIPS = col_double(),
##   REGION = col_double(),
##   LOCALE = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   CCBASIC = col_double(),
##   CCUGPROF = col_double(),
##   CCSIZSET = col_double(),
##   DISTANCEONLY = col_double(),
##   ICLEVEL = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 7 parsing failures.
##  row       col expected actual               file
## 5345 LOCALE    a double   NULL 'College_Data.csv'
## 5345 LATITUDE  a double   NULL 'College_Data.csv'
## 5345 LONGITUDE a double   NULL 'College_Data.csv'
## 5345 CCBASIC   a double   NULL 'College_Data.csv'
## 5345 CCUGPROF  a double   NULL 'College_Data.csv'
## .... ......... ........ ...... ..................
## See problems(...) for more details.
  #data wrangling
  
  #select variables 
college_df <- College_Data %>% 
  select(INSTNM, ADM_RATE, ACTCMMID, SAT_AVG, INEXPFTE, C150_4,ICLEVEL, MD_EARN_WNE_P6, DEBT_MDN, GRAD_DEBT_MDN)

college_df$ADM_RATE <- as.double(college_df$ADM_RATE)
## Warning: NAs introduced by coercion
college_df$ACTCMMID <- as.double(college_df$ACTCMMID)
## Warning: NAs introduced by coercion
college_df$SAT_AVG <- as.double(college_df$SAT_AVG)
## Warning: NAs introduced by coercion
college_df$INEXPFTE <- as.double(college_df$INEXPFTE)
## Warning: NAs introduced by coercion
college_df$C150_4 <- as.double(college_df$C150_4)
## Warning: NAs introduced by coercion
college_df$MD_EARN_WNE_P6 <- as.double(college_df$MD_EARN_WNE_P6)
## Warning: NAs introduced by coercion
college_df$DEBT_MDN <- as.double(college_df$DEBT_MDN)
## Warning: NAs introduced by coercion
college_df$GRAD_DEBT_MDN <- as.double(college_df$GRAD_DEBT_MDN)
## Warning: NAs introduced by coercion
  #update name of college for joining 
colnames(college_df)[colnames(college_df) == "INSTNM"] <- "name"

  #select variables 
col1 <- tuition_cost %>%
  select(c(name, room_and_board, in_state_tuition, out_of_state_tuition))
col2<- salary_potential %>%
  select(c(mid_career_pay, make_world_better_percent, stem_percent, name))

  #joing datasets 
col <- inner_join(col1, col2, by = "name")
college_factors <- inner_join(col, college_df, by = "name")
  #view summary of data 
summary(college_factors)
##      name           room_and_board  in_state_tuition out_of_state_tuition
##  Length:716         Min.   : 1698   Min.   : 4118    Min.   : 4118       
##  Class :character   1st Qu.: 8948   1st Qu.:10021    1st Qu.:20754       
##  Mode  :character   Median :10550   Median :26000    Median :29188       
##                     Mean   :10924   Mean   :26597    Mean   :30862       
##                     3rd Qu.:12880   3rd Qu.:40708    3rd Qu.:41038       
##                     Max.   :18156   Max.   :58230    Max.   :58230       
##                     NA's   :35                                           
##  mid_career_pay   make_world_better_percent  stem_percent       ADM_RATE     
##  Min.   : 60100   Min.   :33.00             Min.   :  0.00   Min.   :0.0436  
##  1st Qu.: 81100   1st Qu.:48.00             1st Qu.:  7.00   1st Qu.:0.5458  
##  Median : 89100   Median :52.00             Median : 12.50   Median :0.6895  
##  Mean   : 92114   Mean   :53.48             Mean   : 16.55   Mean   :0.6527  
##  3rd Qu.:100300   3rd Qu.:58.00             3rd Qu.: 22.00   3rd Qu.:0.8114  
##  Max.   :158200   Max.   :94.00             Max.   :100.00   Max.   :1.0000  
##                   NA's   :28                                 NA's   :58      
##     ACTCMMID        SAT_AVG        INEXPFTE          C150_4          ICLEVEL 
##  Min.   :17.00   Min.   : 925   Min.   :     0   Min.   :0.0000   Min.   :1  
##  1st Qu.:22.00   1st Qu.:1092   1st Qu.:  7466   1st Qu.:0.4503   1st Qu.:1  
##  Median :24.00   Median :1155   Median :  9890   Median :0.5941   Median :1  
##  Mean   :24.61   Mean   :1180   Mean   : 12814   Mean   :0.5945   Mean   :1  
##  3rd Qu.:27.00   3rd Qu.:1239   3rd Qu.: 14158   3rd Qu.:0.7320   3rd Qu.:1  
##  Max.   :36.00   Max.   :1566   Max.   :108509   Max.   :1.0000   Max.   :1  
##  NA's   :148     NA's   :143                     NA's   :14                  
##  MD_EARN_WNE_P6      DEBT_MDN     GRAD_DEBT_MDN  
##  Min.   : 17400   Min.   : 4563   Min.   : 5600  
##  1st Qu.: 31200   1st Qu.:14000   1st Qu.:20711  
##  Median : 35650   Median :16750   Median :23750  
##  Mean   : 37775   Mean   :16874   Mean   :23078  
##  3rd Qu.: 42000   3rd Qu.:19500   3rd Qu.:26256  
##  Max.   :112100   Max.   :29500   Max.   :40000  
##  NA's   :10       NA's   :4       NA's   :5
  #take out unnesseary values 
college_pc <- college_factors %>%
  select(-c(name, ICLEVEL))

  #remove NA and replace with 0s 
college_pc[is.na(college_pc)] <- 0
principle_components <- prcomp(college_pc, scale. = T)

  #look at principle compenents
principle_components$rotation[1:5,1:4]
##                                   PC1         PC2         PC3        PC4
## room_and_board             0.31734579 -0.14009048  0.06844336  0.1722346
## in_state_tuition           0.33778060 -0.02410867  0.29070941  0.1835468
## out_of_state_tuition       0.37625596 -0.01076422  0.21450991  0.1581642
## mid_career_pay             0.36371271  0.17353522 -0.01947342 -0.1821042
## make_world_better_percent -0.08375642  0.02192628 -0.14910722 -0.6563008
std_dev <- principle_components$sdev
pr_var <- std_dev^2
prop_varex <- pr_var/sum(pr_var)

  #plot changing variance 
plot(cumsum(prop_varex), xlab = "Principle Component", ylab = "Cummulative Proportion of Variance Explained", type = "b")
abline(h = .95, col = "grey")

sum(prop_varex[1:9])
## [1] 0.9533869

With 9 principle components, 95% of the variance is explained.

  #get principle components
college_pc <- cbind(college_pc, principle_components$x)
college_pc <- as.data.frame(college_pc[, 15:23])
head(college_pc)
##          PC1        PC2        PC3        PC4        PC5        PC6        PC7
## 1 -2.3852394 -0.1951275 -1.3344202  0.2938433 -0.4548928  0.4556749 -0.5327095
## 2  0.2197772 -0.5067813  2.9680314 -0.3789071 -0.4780764  1.3683271  0.4793397
## 3 -2.6619101 -2.1818263 -0.2248277 -0.9136863  0.2041236 -0.2415222  0.8206367
## 4 -2.1079738  0.6094843  1.2866421 -0.8744944 -0.6563394  1.2337956 -0.9281638
## 5  3.7381933 -0.4000781  0.2602222 -5.2904344 -0.3734207 -1.6273135 -3.9086511
## 6  0.3902363 -1.6371283  1.0612125 -0.6332616 -0.9723017 -0.4513086 -0.2816824
##          PC8        PC9
## 1 -0.8361891  0.2502482
## 2 -1.0519618 -0.3316535
## 3 -1.4647345 -0.2131491
## 4 -0.5627559 -0.7413327
## 5  1.7037832  0.1626732
## 6  0.6677532  0.3050402
set.seed(2345)
  #create formula 
kmean_withinss <- function(k) {
    cluster <- kmeans(college_pc, k)
    return (cluster$tot.withinss)
}
  
  #look at best number of clusters 
max_k <- 20 
wss <- sapply(2:max_k, kmean_withinss)
elbow <- data.frame(2:max_k, wss)

ggplot(elbow, aes(x = X2.max_k, y = wss)) +
    geom_point() +
    geom_line() +
    scale_x_continuous(breaks = seq(1, 20, by = 1))

We will choose 8 as the optimal cluster.

  #examine clusters 
pc_cluster <- kmeans(college_pc, 8)
pc_cluster$cluster
##   [1] 3 4 3 5 8 8 8 3 7 8 6 1 7 8 3 7 5 5 6 8 3 8 3 8 3 3 1 3 5 1 3 3 7 4 8 5 8
##  [38] 6 6 3 3 8 8 6 3 4 8 3 8 3 3 3 3 1 1 4 5 8 8 3 3 3 1 4 1 8 1 6 3 3 8 8 1 1
##  [75] 8 8 3 8 1 3 3 8 7 7 8 3 3 3 8 6 3 8 7 3 6 8 8 1 3 6 4 8 3 8 3 8 3 3 8 1 4
## [112] 1 8 3 5 4 5 4 3 1 3 3 3 8 3 1 3 8 1 4 8 8 3 3 3 1 1 3 3 8 4 8 4 3 5 8 4 8
## [149] 3 1 8 3 3 3 5 3 3 3 3 8 8 8 1 3 3 4 3 3 8 3 3 3 3 3 8 3 3 8 5 3 3 8 3 3 3
## [186] 3 8 8 3 3 3 4 6 7 5 8 4 3 6 3 1 3 4 1 8 8 5 8 3 1 3 1 3 3 8 7 8 8 7 4 8 3
## [223] 3 6 8 3 8 3 3 3 3 3 8 8 3 8 1 3 4 4 5 3 3 3 3 3 3 3 1 7 8 4 4 1 4 3 2 3 3
## [260] 6 8 8 1 5 8 4 8 3 3 3 6 8 3 8 3 3 8 8 4 8 8 3 3 8 7 5 8 5 7 8 8 8 3 8 8 3
## [297] 3 1 8 3 8 3 3 8 2 8 8 4 7 3 6 3 8 8 6 3 3 3 1 3 3 6 3 8 8 8 3 3 3 3 3 8 6
## [334] 8 3 5 3 8 3 3 3 3 8 8 5 3 7 8 3 5 5 5 8 6 8 6 1 3 3 3 3 8 3 1 5 3 3 3 3 3
## [371] 3 3 3 6 3 6 3 1 4 8 8 3 3 8 8 8 3 4 3 3 3 6 3 3 3 2 3 8 3 8 7 8 6 6 3 7 3
## [408] 5 1 3 8 5 1 3 8 8 8 5 8 8 1 8 1 3 8 8 1 8 8 5 8 8 3 4 6 4 1 3 2 6 4 3 8 3
## [445] 8 8 8 8 8 8 4 8 8 5 4 8 3 1 8 8 8 8 8 3 3 3 8 4 3 8 3 3 3 3 5 3 1 6 5 3 3
## [482] 3 5 3 3 8 3 8 8 7 7 7 7 7 7 8 1 3 3 5 3 5 4 1 8 3 4 8 1 3 8 6 3 8 3 2 8 7
## [519] 5 6 7 3 3 8 3 8 1 8 3 3 1 3 8 7 3 3 5 3 3 3 3 6 3 2 3 8 3 3 3 3 3 1 8 3 8
## [556] 8 8 8 8 8 3 8 8 8 8 3 3 6 3 3 3 3 8 3 8 3 3 3 3 3 3 3 6 5 3 6 5 3 3 3 1 3
## [593] 3 3 8 2 8 8 3 3 8 3 3 3 3 3 3 3 1 8 1 6 8 4 8 1 1 8 8 8 3 8 3 3 3 1 3 3 3
## [630] 5 8 3 8 3 8 3 3 3 3 8 3 3 3 3 3 3 3 3 3 8 3 6 3 3 8 1 5 1 3 8 8 3 3 8 4 8
## [667] 8 6 3 1 4 3 6 3 1 6 3 1 8 3 4 3 3 3 6 3 3 5 6 3 3 8 6 5 3 1 8 3 8 4 3 8 3
## [704] 5 1 6 3 3 8 4 8 4 8 3 1 8
pc_cluster$centers
##         PC1         PC2          PC3         PC4         PC5         PC6
## 1  4.827956  1.91841731 -0.428070580  0.24014651 -0.12111590 -0.25645429
## 2 -1.999867  4.71555786  1.304853532 -3.72829083 -0.89853133 -1.92865825
## 3 -1.187859 -0.36236013 -0.933205083 -0.03641196 -0.11016925  0.01717884
## 4  1.077969  0.09477732  3.152451490  0.12882713 -0.19064639  1.21091244
## 5 -1.969200  0.46210438  1.618518263 -0.14620149 -0.31774188  1.13539565
## 6 -3.689187  2.52431459  1.286423221 -0.30319942  0.19619910 -0.40404993
## 7 -1.197118 -0.02780196  1.327315833  2.52929662  2.40137745 -1.35098259
## 8  1.484192 -0.89339424  0.001398552 -0.15297638 -0.01482445 -0.11644834
##            PC7         PC8         PC9
## 1  0.323219952  0.01379208  0.17753983
## 2 -1.076216470 -1.51907549 -0.41228215
## 3 -0.009033058 -0.05327904  0.08977257
## 4 -0.434501675 -0.16100360 -0.27624690
## 5 -0.343626465 -0.06744575  0.11803749
## 6  0.624902190  0.59072998  0.04522226
## 7 -0.426386456 -0.44736827  0.02014949
## 8  0.038064978  0.10931511 -0.15351054
pc_cluster$size 
## [1]  62   7 296  43  42  43  26 197
  #correlation matix
center <- pc_cluster$centers
cluster <- c(1:8)
centerdf <- data.frame(cluster, center)
head(centerdf)
##   cluster       PC1         PC2        PC3         PC4        PC5         PC6
## 1       1  4.827956  1.91841731 -0.4280706  0.24014651 -0.1211159 -0.25645429
## 2       2 -1.999867  4.71555786  1.3048535 -3.72829083 -0.8985313 -1.92865825
## 3       3 -1.187859 -0.36236013 -0.9332051 -0.03641196 -0.1101693  0.01717884
## 4       4  1.077969  0.09477732  3.1524515  0.12882713 -0.1906464  1.21091244
## 5       5 -1.969200  0.46210438  1.6185183 -0.14620149 -0.3177419  1.13539565
## 6       6 -3.689187  2.52431459  1.2864232 -0.30319942  0.1961991 -0.40404993
##            PC7         PC8         PC9
## 1  0.323219952  0.01379208  0.17753983
## 2 -1.076216470 -1.51907549 -0.41228215
## 3 -0.009033058 -0.05327904  0.08977257
## 4 -0.434501675 -0.16100360 -0.27624690
## 5 -0.343626465 -0.06744575  0.11803749
## 6  0.624902190  0.59072998  0.04522226
center_reshape <- gather(centerdf, features, values, PC1:PC9)

ggplot(data = center_reshape, aes(x = features, y = cluster, fill = values)) +
    scale_y_continuous(breaks = seq(1, 7, by = 1)) +
    geom_tile() +
    coord_equal() +
    theme_classic()